In [1]:
import pandas as pd
import gc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from io import StringIO
import session_info
%matplotlib inline 
In [2]:
import itertools
import bioframe as bf
import bioframe.vis

import dash_bio
In [4]:
gc.collect()
Out[4]:
15
In [42]:
## read the data from Google Drive public link
## data = pd.read_csv('./datacp_upd_fxd_210822.csv.gz')

link1 = "https://drive.google.com/file/d/1oiMPamG0tZBbXTI8RuBSHEBZo7CtEaLP/view?usp=sharing"
link2 =f"https://drive.google.com/uc?export=download&confirm=t&id={link1.split('/')[-2]}"

response = requests.get(link2) ## get the gzipped CSV table 
raw = response.raw

with open('./outdata.gz', 'wb') as out_file:
    out_file.write( raw.read() ) ## write to file
## if something went wrong with your download you can get the data with link1
In [5]:
## read the data from downloaded gzipped CSV table
data = pd.read_csv('./outdata.gz')
<ipython-input-5-679bf3f10dcf>:1: DtypeWarning: Columns (20,21) have mixed types. Specify dtype option on import or set low_memory=False.
  data = pd.read_csv('./outdata.gz')
In [6]:
data.head()
Out[6]:
qseqid caption_x pident length mismatch gapopen qstart qend sstart send ... phylum class order family genus species data_type project prog fxd_taxid
0 NODE_125_length_10672_cov_442.064990 YP_009337182.1 32.860 1339 846 20 6629 2673 17 1322 ... Negarnaviricota Monjiviricetes Mononegavirales Xinmoviridae Anphevirus Dipteran anphevirus trx PRJNA353242 blastx 1922872
1 NODE_1_length_14482_cov_858.024121 YP_009337182.1 31.780 1438 926 22 6608 2355 17 1419 ... Negarnaviricota Monjiviricetes Mononegavirales Xinmoviridae Anphevirus Dipteran anphevirus trx PRJNA275431 blastx 1922872
2 NODE_1_length_14486_cov_119.472039 YP_009337182.1 31.780 1438 926 22 6605 2352 17 1419 ... Negarnaviricota Monjiviricetes Mononegavirales Xinmoviridae Anphevirus Dipteran anphevirus trx PRJNA275662 blastx 1922872
3 NODE_2_length_14498_cov_773.643564 YP_009337182.1 31.780 1438 926 22 6616 2363 17 1419 ... Negarnaviricota Monjiviricetes Mononegavirales Xinmoviridae Anphevirus Dipteran anphevirus trx PRJNA297027 blastx 1922872
4 NODE_13_length_514_cov_145.931393 YP_009337182.1 29.412 170 109 3 11 514 895 1055 ... Negarnaviricota Monjiviricetes Mononegavirales Xinmoviridae Anphevirus Dipteran anphevirus trx PRJNA438159 blastx 1922872

5 rows × 28 columns

In [7]:
datacp_upd_fxd_eval03 = data[data.evalue < 10**-3]
In [8]:
cont_foo = datacp_upd_fxd_eval03[datacp_upd_fxd_eval03.kindom == 'Viruses']
## cont_foo is the table of contigs hits with Viral nucleotide sequences (with BLASTx)
## with e-values < 10**-3
In [9]:
## the function to merge the overlapping intervals -- to assess the overall alignment length
def get_cover(tdf):
    return bf.merge( pd.DataFrame( {'chrom' : 'chr1',
                      'start' : tdf[['qstart', 'qend']].min(1),   ## the minimal value stands for the start
                      'end' : tdf[['qstart', 'qend']].max(1) } ), ## and the maximal value stands for the end
             min_dist=0)[['start', 'end']].dot(np.array([-1,1])).sum()
In [10]:
## we applied this aggregation since in general the lengths of viral genomes belonging to the same family are quite close
## here we calculated the total alignment lengths by viral family
cont_virfam_cover = cont_foo.groupby(['qseqid','family']).apply(get_cover).unstack()
In [11]:
## the clustermap of alignment lengths by contigs and viral families
## it was customized to drop the denrogram for contigs
## and to adjust the heght of the dendrogram for viral families
## as it was suggested in:
## https://www.data-to-viz.com/graph/dendrogram.html

cg = sns.clustermap(np.log( cont_virfam_cover.fillna(0).astype(int).iloc[:, 1:] + 1 ).drop_duplicates() , figsize = [20, 30],
                    dendrogram_ratio=(.01, .1) );
cg.ax_row_dendrogram.set_visible(False) #suppress row dendrogram
#cg.ax_col_dendrogram.set_visible(False) #suppress column dendrogram
cg.ax_cbar.set_visible(False);

cg.ax_heatmap.set_xlabel('')
cg.ax_heatmap.xaxis.tick_top() # x axis on top
cg.ax_heatmap.xaxis.set_label_position('top')
cg.ax_heatmap.set_xticklabels(cg.ax_heatmap.get_xticklabels(), rotation = 90);

plt.gcf().subplots_adjust(hspace=0.125, wspace=0.125);
In [12]:
## don't forget to collect the garbage from time to time
gc.collect()
Out[12]:
15
In [13]:
data_array = np.log( cont_virfam_cover.fillna(0).astype(int).iloc[:, 1:] + 1 ).drop_duplicates()

dbg = dash_bio.Clustergram(
    data=data_array,
    column_labels=list(data_array.columns.values),
    row_labels=list(data_array.index),
    height=13000,
    width=1000,
    
    #colorbar = None,
    
    optimal_leaf_order = True,
    
    display_ratio=[0.001, 0.01],
    
    link_method = 'single',
    
    color_map= [
        [0.0, '#636EFA'],
        [0.25, '#AB63FA'],
        [0.5, '#FFFFFF'],
        [0.75, '#E763FA'],
        [1.0, '#EF553B']
    ],
    #config = {'style': {'font-size': 12} }
   # return_computed_traces = True
)
In [14]:
## https://stackoverflow.com/questions/58630928
dbg.update_traces(dict(showscale=False), selector={'type':'heatmap'})
dbg.update_layout(font_family="Consolas",
                  font=dict(size= 8) )
In [15]:
session_info.show(excludes = ['pyarrow','distributed'])
Out[15]:
Click to view session information
-----
bioframe            0.3.3
dash_bio            1.0.2
matplotlib          3.3.1
numpy               1.20.2
pandas              1.4.3
plotly              4.12.0
requests            2.25.1
seaborn             0.11.0
session_info        1.0.0
-----
Click to view modules imported as dependencies
Bio                 1.78
GEOparse            2.0.3
PIL                 8.2.0
asciitree           NA
attr                20.3.0
backcall            0.2.0
blinker             1.4
brotli              NA
certifi             2022.06.15
cffi                1.14.5
cftime              1.4.1
chardet             3.0.4
click               7.1.2
cloudpickle         1.6.0
colorama            0.4.4
colour              NA
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dash                1.17.0
dash_renderer       1.8.3
dask                2020.12.0
dateutil            2.8.1
decorator           5.0.6
fastcluster         1.1.26
fasteners           NA
fiona               1.8.13
flask               1.1.2
flask_compress      NA
future              0.18.2
geopandas           0.8.1
google              NA
idna                2.8
ipykernel           5.3.4
ipython_genutils    0.2.0
ipywidgets          7.6.3
itsdangerous        1.1.0
jedi                0.17.0
jinja2              2.11.3
joblib              1.0.1
jsonschema          3.2.0
kiwisolver          1.3.1
lxml                4.6.3
markupsafe          1.1.1
mpl_toolkits        NA
msgpack             1.0.2
nbformat            5.1.3
netCDF4             1.5.6
nt                  NA
ntsecuritycon       NA
numcodecs           0.7.3
numexpr             2.7.3
parmed              3.4.3
parso               0.8.2
periodictable       1.6.1
pickleshare         0.7.5
pkg_resources       NA
prompt_toolkit      3.0.17
psutil              5.8.0
pvectorc            NA
pygments            2.8.1
pyparsing           2.4.7
pyproj              2.6.1.post1
pyrsistent          NA
pythoncom           NA
pytz                2021.1
pywintypes          NA
retrying            NA
rtree               0.9.4
scipy               1.5.3
shapely             1.7.1
six                 1.12.0
sklearn             0.23.2
socks               1.7.1
statsmodels         0.12.2
stdlib_list         v0.8.0
storemagic          NA
tblib               1.7.0
tlz                 0.11.0
toolz               0.11.1
tornado             6.1
tqdm                4.59.0
traitlets           5.0.5
typing_extensions   NA
urllib3             1.25.11
wcwidth             0.2.5
werkzeug            1.0.1
win32api            NA
win32com            NA
win32security       NA
xarray              0.17.0
yaml                5.4.1
zarr                2.7.0
zmq                 20.0.0
-----
IPython             7.19.0
jupyter_client      6.1.12
jupyter_core        4.7.1
notebook            6.3.0
-----
Python 3.8.8 (default, Feb 24 2021, 15:54:32) [MSC v.1928 64 bit (AMD64)]
Windows-10-10.0.19041-SP0
-----
Session information updated at 2022-12-16 15:47